import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import os
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

if not os.path.exists("figures"):
    os.makedirs("figures")

theme_colors = px.colors.qualitative.G10
template = "plotly_white"
df = pd.read_csv("data/clean_lightcast_job_postings.csv")

print("Dataset shape:", df.shape)
print("\nSalary statistics:")
print(df['SALARY'].describe())
Dataset shape: (69198, 109)

Salary statistics:
count     69198.000000
mean     116806.493352
std       29426.450593
min       15860.000000
25%      116300.000000
50%      116300.000000
75%      116300.000000
max      500000.000000
Name: SALARY, dtype: float64
employment_counts = df['EMPLOYMENT_TYPE_NAME'].value_counts().reset_index()
employment_counts.columns = ['Employment Type', 'Count']

figa = px.bar(employment_counts, x='Employment Type', y='Count', 
              title='Job Postings by Employment Type',
              color='Employment Type', color_discrete_sequence=theme_colors,
              template=template)
figa.update_layout(
    xaxis_title="Employment Type",
    yaxis_title="Number of Postings",
    legend_title="Employment Type",
    font=dict(family="Arial, sans-serif", size=12)
)
figa.write_html("figures/job_postings_by_employment_type.html")
figa.write_image("figures/job_postings_by_employment_type.png")
figa.show()
salary_df = df[df['SALARY'].notna() & (df['SALARY'] > 0) & (df['SALARY'] < 500000)]

figb = px.box(salary_df, x="CITY_NAME", y="SALARY", 
              title="Salary Distribution by City",
              color_discrete_sequence=theme_colors,
              template=template)

# Show only top 10 cities by median salary to make the visualization clearer
top_cities = salary_df.groupby('CITY_NAME')['SALARY'].median().nlargest(10).index.tolist()
figb = px.box(salary_df[salary_df['CITY_NAME'].isin(top_cities)], 
              x="CITY_NAME", y="SALARY", 
              title="Salary Distribution by Top 10 Cities",
              color="CITY_NAME", color_discrete_sequence=theme_colors,
              template=template)
figb.update_layout(
    xaxis_title="City",
    yaxis_title="Salary ($)",
    showlegend=False,
    font=dict(family="Arial, sans-serif", size=12)
)
figb.write_html("figures/salary_distribution_by_city.html")
figb.write_image("figures/salary_distribution_by_city.png")
figb.show()
remote_counts = df['REMOTE_TYPE_NAME'].value_counts().reset_index()
remote_counts.columns = ['Remote Type', 'Count']

figc = px.pie(remote_counts, names='Remote Type', values='Count', 
              title='Distribution of Remote vs. On-Site Jobs',
              color='Remote Type', color_discrete_sequence=theme_colors,
              template=template)
figc.update_layout(
    legend_title="Remote Status",
    font=dict(family="Arial, sans-serif", size=12)
)
figc.write_html("figures/remote_vs_onsite_distribution.html")
figc.write_image("figures/remote_vs_onsite_distribution.png")
figc.show()
# Convert date columns to datetime
date_columns = ['LAST_UPDATED_DATE', 'POSTED', 'EXPIRED', 'MODELED_EXPIRED']
for col in date_columns:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

df['POSTING_MONTH'] = df['POSTED'].dt.to_period('M').astype(str)
# Create a time-based analysis of job postings
monthly_postings = df.groupby('POSTING_MONTH').size().reset_index(name='count')
monthly_postings['POSTING_MONTH'] = pd.to_datetime(monthly_postings['POSTING_MONTH'])
monthly_postings = monthly_postings.sort_values('POSTING_MONTH')
monthly_postings['POSTING_MONTH'] = monthly_postings['POSTING_MONTH'].dt.strftime('%Y-%m')

fig1 = px.line(monthly_postings, x='POSTING_MONTH', y='count', 
               title='Monthly Job Posting Trends',
               labels={'count': 'Number of Postings', 'POSTING_MONTH': 'Month'},
               template=template)
fig1.update_layout(
    xaxis_title="Month",
    yaxis_title="Number of Job Postings",
    legend_title="Legend",
    font=dict(family="Arial, sans-serif", size=12)
)
fig1.write_html("figures/monthly_job_posting_trends.html")
fig1.write_image("figures/monthly_job_posting_trends.png")
fig1.show()
# Employment Type Analysis
employment_counts = df['EMPLOYMENT_TYPE_NAME'].value_counts().reset_index()
employment_counts.columns = ['Employment Type', 'Count']

fig2 = px.bar(employment_counts, x='Employment Type', y='Count', 
              title='Job Postings by Employment Type',
              color='Employment Type', color_discrete_sequence=theme_colors,
              template=template)
fig2.update_layout(
    xaxis_title="Employment Type",
    yaxis_title="Number of Postings",
    legend_title="Employment Type",
    font=dict(family="Arial, sans-serif", size=12)
)
fig2.write_html("figures/job_postings_by_employment_type.html")
fig2.write_image("figures/job_postings_by_employment_type.png")
fig2.show()

# Remote Work Analysis with nicer formatting
remote_counts = df['REMOTE_TYPE_NAME'].value_counts().reset_index()
remote_counts.columns = ['Remote Type', 'Count']

fig3 = px.pie(remote_counts, names='Remote Type', values='Count', 
              title='Distribution of Remote vs. On-Site Jobs',
              color='Remote Type', color_discrete_sequence=theme_colors,
              template=template)
fig3.update_layout(
    legend_title="Remote Status",
    font=dict(family="Arial, sans-serif", size=12)
)
fig3.write_html("figures/remote_vs_onsite_distribution.html")
fig3.write_image("figures/remote_vs_onsite_distribution.png")
fig3.show()
# Salary Analysis
salary_df = df[df['SALARY'].notna() & (df['SALARY'] > 0)]

# Top cities by average salary (with at least 50 job postings)
city_salary = salary_df.groupby('CITY_NAME').agg(
    avg_salary=('SALARY', 'mean'),
    job_count=('SALARY', 'count')
).reset_index()

city_salary_filtered = city_salary[city_salary['job_count'] >= 50].sort_values('avg_salary', ascending=False).head(10)

fig4 = px.bar(city_salary_filtered, x='CITY_NAME', y='avg_salary',
              title='Top 10 Cities by Average Salary (Min. 50 Postings)',
              text='job_count', color='avg_salary', color_continuous_scale='Viridis',
              template=template)
fig4.update_traces(texttemplate='%{text} jobs', textposition='outside')
fig4.update_layout(
    xaxis_title="City",
    yaxis_title="Average Salary ($)",
    coloraxis_showscale=False,
    font=dict(family="Arial, sans-serif", size=12)
)
fig4.write_html("figures/top_cities_by_avg_salary.html")
fig4.write_image("figures/top_cities_by_avg_salary.png")
fig4.show()
# Salary distribution by state
state_salary = salary_df.groupby('STATE_NAME').agg(
    avg_salary=('SALARY', 'mean'),
    median_salary=('SALARY', 'median'),
    job_count=('SALARY', 'count')
).reset_index()

state_salary_filtered = state_salary[state_salary['job_count'] >= 100].sort_values('avg_salary', ascending=False)

fig5 = px.bar(state_salary_filtered, x='STATE_NAME', y='avg_salary',
              title='Average Salary by State (Min. 100 Postings)',
              color='job_count', text='median_salary',
              labels={'job_count': 'Number of Job Postings'},
              color_continuous_scale='Viridis',
              template=template)
fig5.update_traces(texttemplate='$%{text:.0f} median', textposition='outside')
fig5.update_layout(
    xaxis_title="State",
    yaxis_title="Average Salary ($)",
    coloraxis_colorbar_title="Job Count",
    font=dict(family="Arial, sans-serif", size=12)
)
fig5.write_html("figures/avg_salary_by_state.html")
fig5.write_image("figures/avg_salary_by_state.png")
fig5.show()
# Experience requirements analysis
exp_df = df[df['MIN_YEARS_EXPERIENCE'].notna()]

fig6 = px.histogram(exp_df, x='MIN_YEARS_EXPERIENCE',
                   title='Distribution of Minimum Experience Requirements',
                   color_discrete_sequence=theme_colors,
                   template=template)
fig6.update_layout(
    xaxis_title="Minimum Years of Experience",
    yaxis_title="Number of Job Postings",
    bargap=0.1,
    font=dict(family="Arial, sans-serif", size=12)
)
fig6.write_html("figures/min_experience_distribution.html")
fig6.write_image("figures/min_experience_distribution.png")
fig6.show()
# Education level requirements vs. Salary
edu_salary = df[df['SALARY'].notna() & (df['MIN_EDULEVELS_NAME'].notna())].groupby('MIN_EDULEVELS_NAME').agg(
    avg_salary=('SALARY', 'mean'),
    median_salary=('SALARY', 'median'),
    job_count=('SALARY', 'count')
).reset_index()

edu_salary = edu_salary[edu_salary['job_count'] >= 30].sort_values('avg_salary')

fig7 = px.bar(edu_salary, x='MIN_EDULEVELS_NAME', y='avg_salary',
              title='Average Salary by Minimum Education Level',
              color='job_count', text='median_salary',
              labels={'job_count': 'Number of Job Postings', 'MIN_EDULEVELS_NAME': 'Education Level'},
              color_continuous_scale='Viridis',
              template=template)
fig7.update_traces(texttemplate='$%{text:.0f} median', textposition='outside')
fig7.update_layout(
    xaxis_title="Minimum Education Level",
    yaxis_title="Average Salary ($)",
    coloraxis_colorbar_title="Job Count",
    font=dict(family="Arial, sans-serif", size=12)
)
fig7.write_html("figures/salary_by_education.html")
fig7.write_image("figures/salary_by_education.png")
fig7.show()

# Industry analysis
if 'NAICS_2022_2_NAME' in df.columns:
    industry_field = 'NAICS_2022_2_NAME'
else:
    industry_field = 'NAICS2_NAME'

industry_counts = df[df[industry_field].notna()][industry_field].value_counts().reset_index()
industry_counts.columns = ['Industry', 'Count']
industry_counts = industry_counts.head(10)  # Top 10 industries

fig8 = px.bar(industry_counts, x='Count', y='Industry', 
              title='Top 10 Industries by Job Posting Count',
              orientation='h', color='Count', color_continuous_scale='Viridis',
              template=template)
fig8.update_layout(
    xaxis_title="Number of Job Postings",
    yaxis_title="Industry",
    font=dict(family="Arial, sans-serif", size=12),
    yaxis={'categoryorder':'total ascending'}
)
fig8.write_html("figures/top_industries.html")
fig8.write_image("figures/top_industries.png")
fig8.show()

# Salary Box Plot by Remote Type
fig9 = px.box(salary_df, x='REMOTE_TYPE_NAME', y='SALARY',
              title='Salary Distribution by Remote Work Type',
              color='REMOTE_TYPE_NAME', color_discrete_sequence=theme_colors,
              template=template)
fig9.update_layout(
    xaxis_title="Remote Work Type",
    yaxis_title="Salary ($)",
    showlegend=False,
    font=dict(family="Arial, sans-serif", size=12)
)
fig9.write_html("figures/salary_by_remote_type.html")
fig9.write_image("figures/salary_by_remote_type.png")
fig9.show()
# Create a combined visualization: Salary distribution by employment type with average markers
fig10 = px.box(salary_df, x='EMPLOYMENT_TYPE_NAME', y='SALARY',
              title='Salary Distribution by Employment Type',
              color='EMPLOYMENT_TYPE_NAME', color_discrete_sequence=theme_colors,
              template=template)

emp_salary_avg = salary_df.groupby('EMPLOYMENT_TYPE_NAME')['SALARY'].mean().reset_index()
for i, emp_type in enumerate(emp_salary_avg['EMPLOYMENT_TYPE_NAME']):
    avg_salary = emp_salary_avg.loc[emp_salary_avg['EMPLOYMENT_TYPE_NAME'] == emp_type, 'SALARY'].values[0]
    fig10.add_trace(
        go.Scatter(
            x=[emp_type],
            y=[avg_salary],
            mode='markers',
            marker=dict(color='red', size=10, symbol='star'),
            name=f'Average: ${avg_salary:.0f}'
        )
    )

fig10.update_layout(
    xaxis_title="Employment Type",
    yaxis_title="Salary ($)",
    showlegend=False,
    font=dict(family="Arial, sans-serif", size=12)
)
fig10.write_html("figures/salary_by_employment_type.html")
fig10.write_image("figures/salary_by_employment_type.png")
fig10.show()